############
############
############	This file creates the true population of 1,000,000 people living in
############	25 different units. The first ten are large units (7%) and the latter
############	15 units are small (3%).
############	The individual variables (normally income, age, education, ...) is modeled 
############	here as gamma1, gamma2, gamma3.
############
############	Lucas Leemann, leemann@ucl.ac.uk
############	January 2015 
############
############
############
############
############



library(MASS)
start.time <-  Sys.time()

	# Population Size
	N.obs <- 1000000
	
	sdL.ind <- 1
	sdL.cont <- 1
	SD.tuning <- 1

	#########################################################################
	#########################################################################
	#########################################################################
	set.seed(123)


	muL <- rep(0,N.obs)
	
	
	###############################	
	# semi-correlated draws, rho=0.4
	rho <- 0.4
	SigmaL <- matrix(c(1,rho,rho,rho,rho,1,rho,rho,rho,rho,1,rho,rho,rho,rho,1),4,4)
	GAMMA <- matrix(NA,N.obs,4)
	for (i in 1:N.obs){
		GAMMA[i,] <- mvrnorm(1, rep(0,4), SigmaL)
	}
	
	gamma1 <- GAMMA[,1]
	gamma2 <- GAMMA[,2]
	gamma3 <- GAMMA[,3]
	
	ind.error <- rnorm(N.obs,0,sdL.ind)
		
	# transfer to discrete
	# gamma1
	gamma1.d <- rep(NA,N.obs)
	gamma1.d[gamma1< -1] <- 1 
	gamma1.d[gamma1> -1 & gamma1 < 0 ] <- 2
	gamma1.d[gamma1> 0 & gamma1 < 1 ] <- 3
	gamma1.d[gamma1 > 1] <- 4
	# gamma2
	gamma2.d <- rep(NA,N.obs)
	gamma2.d[gamma2< -1] <- 1 
	gamma2.d[gamma2> -1 & gamma2 < 0 ] <- 2
	gamma2.d[gamma2> 0 & gamma2 < 1 ] <- 3
	gamma2.d[gamma2 > 1] <- 4				
	# gamma3
	gamma3.d <- rep(NA,N.obs)
	gamma3.d[gamma3< -1] <- 1 
	gamma3.d[gamma3> -1 & gamma3 < 0 ] <- 2
	gamma3.d[gamma3> 0 & gamma3 < 1 ] <- 3
	gamma3.d[gamma3 > 1] <- 4
	
	# true population
	GAMMA.d <- cbind(gamma1.d,gamma2.d,gamma3.d)
	true.type <- rep(NA,N.obs)
	
	# object *raster* contains all 64 possible ideal types (4^3)	
	raster <- 	rbind(c(1,1,1), c(1,1,2), c(1,1,3), c(1,1,4), c(1,2,1), c(1,2,2),c(1,2,3), c(1,2,4), c(1,3,1), c(1,3,2), c(1,3,3), c(1,3,4), c(1,4,1), c(1,4,2), c(1,4,3), c(1,4,4), c(2,1,1), c(2,1,2), c(2,1,3), c(2,1,4), c(2,2,1), c(2,2,2), c(2,2,3), c(2,2,4), c(2,3,1), c(2,3,2), c(2,3,3), c(2,3,4), c(2,4,1), c(2,4,2), c(2,4,3), c(2,4,4), c(3,1,1), c(3,1,2), c(3,1,3), c(3,1,4), c(3,2,1), c(3,2,2), c(3,2,3), c(3,2,4), c(3,3,1), c(3,3,2), c(3,3,3), c(3,3,4), c(3,4,1), c(3,4,2), c(3,4,3), c(3,4,4),c(4,1,1), c(4,1,2), c(4,1,3), c(4,1,4), c(4,2,1), c(4,2,2), c(4,2,3), c(4,2,4), c(4,3,1), c(4,3,2), c(4,3,3), c(4,3,4), c(4,4,1), c(4,4,2), c(4,4,3), c(4,4,4))
	
	# *population* measures exactly which ideal type a simulated citizen is
	population <- rep(NA,N.obs)
	for (i in 1:N.obs){
		testi <- rep(NA,64)
		for (j in 1:64){
			testL <- GAMMA.d[i,]==raster[j,]
			ifelse(testL==c(TRUE, TRUE, TRUE), testi[j] <- 1, testi[j] <- 0)  
		}
	population[i] <- which(testi==TRUE)
	 if (i %% 50000 == 0) print(paste("Finished MC Simulation iteration",i))
	}
	
	summary(population)
	
	unit <- c(rep(1,70000), rep(2,70000), rep(3,70000), rep(4,70000), rep(5,70000), rep(6,70000), rep(7,70000), rep(8,70000), rep(9,70000), rep(10,70000), rep(11,20000), rep(12,20000), rep(13,20000), rep(14,20000), rep(15,20000), rep(16,20000), rep(17,20000), rep(18,20000), rep(19,20000), rep(20,20000), rep(21,20000), rep(22,20000), rep(23,20000), rep(24,20000), rep(25,20000))
	unit.c <- c(rep(1,64),rep(2,64), rep(3,64), rep(4,64), rep(5,64), rep(6,64), rep(7,64), rep(8,64), rep(9,64), rep(10,64), rep(11,64), rep(12,64), rep(13,64), rep(14,64), rep(15,64), rep(16,64), rep(17,64), rep(18,64), rep(19,64), rep(20,64), rep(21,64), rep(22,64), rep(23,64), rep(24,64), rep(25,64))
	
	# creating a census file
	# we want to know for each of the 25 units what there structure is, i.e. 
	# how many people per ideal type they have
	census.data <- rep(NA,64*25)
	for (q in 1:25){
		popL <- population[unit==q]
		a <- (q-1)*64 + 1
		b <- q*64	
		census.data[a:b]<- table(popL)
		}
	# creating census in % data
	census.data.percent <- rep(NA, 64*25)
	for (k in 1:25){
		census.data.w <- census.data[unit.c==k]
		a <- (k-1)*64 + 1
		b <- k*64	
		census.data.percent[a:b] <- census.data.w/sum(census.data.w)
	}

	summary(census.data.percent)
	
	# collective contribution
	x.cont <- seq(-.2,.2,length.out=25) + 0.85
	X.CONT <- rep(NA, N.obs)
	for (s in 1:N.obs){
		X.CONT[s] <- x.cont[unit[s]] + rnorm(1,0,sdL.cont)
	}
	
	ind.contribution <- gamma1 + gamma2 + gamma3 + ind.error
	ylatent <- X.CONT + ind.contribution
	support.pred <- pnorm(ylatent, sd=SD.tuning)
	support.01 <- as.numeric(support.pred>0.5)
	GAMMA.population <- cbind(ind.contribution,unit, GAMMA.d,population,support.pred,ylatent,support.01)
	
	independent.draws <- list(census.data.percent, census.data,GAMMA.population)

# saving all the information 
save(independent.draws, file="draws_corr 0.4 B085.RData")


end.time <-  Sys.time()
cat(paste("Job finished at:",end.time))
dur <- end.time-start.time
cat(paste("Job took:",dur))
